Keras: Regression¶
Using Kepler Satellite Sensor Data to Predict Exoplanet Surface Temperature.
For machine-learning recipes on regression, see the “Regression” articles on the MachineLearningMastery.com blog.
[2]:
import keras
from keras import metrics
from keras.models import Sequential
from keras.callbacks import Callback, History
from keras.layers import Input, Dense, BatchNormalization, Activation, Dropout
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
import aiqc
from aiqc import datum
Example Data¶
Reference Example Datasets for more information.
[3]:
# Load AIQC's bundled exoplanet example dataset into a pandas DataFrame.
df = datum.to_pandas('exoplanets.parquet')
[4]:
# Preview the first rows to sanity-check columns and values.
df.head()
[4]:
| TypeFlag | PlanetaryMassJpt | PeriodDays | SurfaceTempK | DistFromSunParsec | HostStarMassSlrMass | HostStarRadiusSlrRad | HostStarMetallicity | HostStarTempK | |
|---|---|---|---|---|---|---|---|---|---|
| 5 | 0 | 0.2500 | 19.224180 | 707.2 | 650.00 | 1.070 | 1.0200 | 0.12 | 5777.0 |
| 6 | 0 | 0.1700 | 39.031060 | 557.9 | 650.00 | 1.070 | 1.0200 | 0.12 | 5777.0 |
| 7 | 0 | 0.0220 | 1.592851 | 1601.5 | 650.00 | 1.070 | 1.0200 | 0.12 | 5777.0 |
| 15 | 0 | 1.2400 | 2.705782 | 2190.0 | 200.00 | 1.630 | 2.1800 | 0.12 | 6490.0 |
| 16 | 0 | 0.0195 | 1.580404 | 604.0 | 14.55 | 0.176 | 0.2213 | 0.10 | 3250.0 |
[5]:
# Inspect dtypes: every column is numeric; TypeFlag is the only int64 column.
df.dtypes
[5]:
TypeFlag int64
PlanetaryMassJpt float64
PeriodDays float64
SurfaceTempK float64
DistFromSunParsec float64
HostStarMassSlrMass float64
HostStarRadiusSlrRad float64
HostStarMetallicity float64
HostStarTempK float64
dtype: object
[6]:
import plotly.express as px
Now we’ll plot the data to get a feel for how the features relate to the surface-temperature target.
[7]:
# 3D scatter of three host-star/distance features, colored by the regression
# target (SurfaceTempK) and sized by planetary mass, on a dark theme.
fig = px.scatter_3d(df, x='HostStarMetallicity', y='HostStarMassSlrMass', z='DistFromSunParsec', color='SurfaceTempK', opacity=0.8, size='PlanetaryMassJpt', color_continuous_scale='Pinkyl')
# Dark background/grid for all three axes of the 3D scene.
fig.update_layout({
    'font_color': 'white', 'plot_bgcolor': '#202020', 'paper_bgcolor': '#202020', 'height': 900,
    'scene':{
        'yaxis':{'backgroundcolor':'#202020', 'gridcolor':'#505050'},
        'xaxis':{'backgroundcolor':'#202020', 'gridcolor':'#505050'},
        'zaxis':{'backgroundcolor':'#202020', 'gridcolor':'#505050'},
    }
})
# Remove marker outlines so dense regions stay readable.
fig.update_traces(marker=dict(line=dict(width=0)))
a) High-Level API¶
Reference High-Level API Docs for more information including how to work with non-tabular data.
Preprocess Data¶
[8]:
# Declare the whole preprocessing pipeline in one call: train/validation/test
# splits, stratification bins for the continuous label, and encoders.
splitset = aiqc.Pipeline.Tabular.make(
    dataFrame_or_filePath = df
    , dtype = None
    , label_column = 'SurfaceTempK'       # regression target
    , features_excluded = None            # use every other column as a feature
    , size_test = 0.22
    , size_validation = 0.12
    , fold_count = None                   # no cross-validation folds
    , bin_count = 4                       # bins used to stratify the continuous label
    , label_encoder = StandardScaler(copy=False)
    # Encoders are applied in order; each consumes the dtypes it matches.
    # NOTE(review): newer scikit-learn renames `sparse` to `sparse_output` on
    # OneHotEncoder — confirm against the pinned sklearn version.
    , feature_encoders = [
        {"dtypes": ['float64'], "sklearn_preprocess": RobustScaler(copy=False)},
        {"dtypes": ['int64'], "sklearn_preprocess": OneHotEncoder(sparse=False)}]
)
___/ featurecoder_index: 0 \_________
=> The column(s) below matched your filter(s) and were ran through a test-encoding successfully.
['PlanetaryMassJpt', 'PeriodDays', 'DistFromSunParsec', 'HostStarMassSlrMass', 'HostStarRadiusSlrRad', 'HostStarMetallicity', 'HostStarTempK']
=> The remaining column(s) and dtype(s) can be used in downstream Featurecoder(s):
{'TypeFlag': 'int64'}
___/ featurecoder_index: 1 \_________
=> The column(s) below matched your filter(s) and were ran through a test-encoding successfully.
['TypeFlag']
=> Done. All feature column(s) have encoder(s) associated with them.
No more Featurecoders can be added to this Encoderset.
Build Model¶
[9]:
def fn_build(features_shape, label_shape, **hp):
    """Assemble an uncompiled Keras Sequential model for regression.

    Parameters
    ----------
    features_shape : tuple
        Shape of one feature sample; passed to the Input layer.
    label_shape : tuple
        Shape of one label sample; ``label_shape[0]`` sizes the output layer.
    **hp
        Hyperparameters: ``blocks`` (int), ``neuron_count`` (int),
        ``batch_norm`` (bool).

    Returns
    -------
    keras.Sequential
        The uncompiled model.
    """
    model = Sequential()
    model.add(Input(shape=features_shape))
    # One Dense "block" per hp['blocks'], each followed by
    # BatchNorm (optional), Activation, Dropout — "B.A.D." ordering.
    for _ in range(hp['blocks']):
        model.add(Dense(hp['neuron_count']))
        if hp['batch_norm']:
            model.add(BatchNormalization())
        model.add(Activation('relu'))
        model.add(Dropout(0.2))
    # Linear output layer (no activation) sized to the label — regression.
    model.add(Dense(label_shape[0]))
    return model
[10]:
def fn_train(model, loser, optimizer, samples_train, samples_evaluate, **hp):
    """Compile and fit the model, stopping early via AIQC metric cutoffs.

    Parameters
    ----------
    model : keras.Sequential
        The model produced by ``fn_build``.
    loser, optimizer
        Loss and optimizer objects supplied by AIQC.
    samples_train, samples_evaluate : dict
        Dicts with ``"features"`` and ``"labels"`` arrays.
    **hp
        Hyperparameters: ``batch_size`` (int), ``epoch_count`` (int).

    Returns
    -------
    keras.Sequential
        The trained model.
    """
    model.compile(
        loss = loser
        , optimizer = optimizer
        , metrics = ['mean_squared_error']
    )
    # Halt training once BOTH training and validation loss fall below 0.025.
    metric_cutoffs = [
        {"metric":"val_loss", "cutoff":0.025, "above_or_below":"below"},
        {"metric":"loss", "cutoff":0.025, "above_or_below":"below"}
    ]
    cutoffs = aiqc.TrainingCallback.Keras.MetricCutoff(metric_cutoffs)
    model.fit(
        samples_train["features"]
        , samples_train["labels"]
        , validation_data = (
            samples_evaluate["features"]
            , samples_evaluate["labels"]
        )
        , verbose = 0
        , batch_size = hp['batch_size']
        , callbacks=[History(), cutoffs]
        , epochs = hp['epoch_count']
    )
    return model
Stage Experiment¶
[11]:
# Hyperparameter grid: each key maps to the list of values to sweep.
# 1 x 1 x 2 x 1 x 2 x 1 = 4 training jobs.
hyperparameters = dict(
    batch_size=[3],
    blocks=[2],
    batch_norm=[True, False],
    epoch_count=[75],
    neuron_count=[24, 36],
    learning_rate=[0.01],
)
[12]:
# Stage the experiment: one job per hyperparameter combination.
queue = aiqc.Experiment.make(
    library = "keras"
    , analysis_type = "regression"
    , fn_build = fn_build
    , fn_train = fn_train
    , fn_predict = None          # use AIQC's default prediction function
    , fn_lose = None             # optional, boilerplate regression.
    , fn_optimize = None         # use AIQC's default optimizer
    , splitset_id = splitset.id
    , foldset_id = None          # no cross-validation
    , repeat_count = 1           # train each combination once
    , hide_test = False
    , hyperparameters = hyperparameters
)
[13]:
# Execute every queued training job (4 hyperparameter combinations here).
queue.run_jobs()
🔮 Training Models 🔮: 100%|██████████████████████████████████████████| 4/4 [00:52<00:00, 13.06s/it]
For more information on visualization of performance metrics, reference the Visualization & Metrics documentation.
b) Low-Level API¶
Reference Low-Level API Docs for more information including how to work with non-tabular data, and defining an optimizer.
Preprocess Data¶
[14]:
# Register the DataFrame as an AIQC tabular Dataset.
dataset = aiqc.Dataset.Tabular.from_pandas(df)
[15]:
# Column to predict (regression target).
label_column = 'SurfaceTempK'
[16]:
# Carve the target column out of the dataset as a Label.
label = dataset.make_label(columns=[label_column])
[17]:
# Standardize the label (zero mean, unit variance); copy=False scales in place.
labelcoder = label.make_labelcoder(
    sklearn_preprocess = StandardScaler(copy=False)
)
[18]:
# Everything except the label column becomes the Feature set.
feature = dataset.make_feature(exclude_columns=[label_column])
[19]:
# Container for the ordered feature encoders added below.
encoderset = feature.make_encoderset()
[20]:
# First encoder: robust-scale all float64 feature columns (in place).
featurecoder_0 = encoderset.make_featurecoder(
    sklearn_preprocess = RobustScaler(copy=False)
    , dtypes = ['float64']
)
___/ featurecoder_index: 0 \_________
=> The column(s) below matched your filter(s) and were ran through a test-encoding successfully.
['PlanetaryMassJpt', 'PeriodDays', 'DistFromSunParsec', 'HostStarMassSlrMass', 'HostStarRadiusSlrRad', 'HostStarMetallicity', 'HostStarTempK']
=> The remaining column(s) and dtype(s) can be used in downstream Featurecoder(s):
{'TypeFlag': 'int64'}
[21]:
# Second encoder: one-hot encode the remaining int64 column (TypeFlag).
# NOTE(review): newer scikit-learn renames `sparse` to `sparse_output` —
# confirm against the pinned sklearn version.
featurecoder_1 = encoderset.make_featurecoder(
    sklearn_preprocess = OneHotEncoder(sparse=False)
    , dtypes = ['int64']
)
___/ featurecoder_index: 1 \_________
=> The column(s) below matched your filter(s) and were ran through a test-encoding successfully.
['TypeFlag']
=> Done. All feature column(s) have encoder(s) associated with them.
No more Featurecoders can be added to this Encoderset.
[22]:
# Bind features and label together and carve out test/validation splits.
splitset = aiqc.Splitset.make(
    feature_ids = [feature.id]
    , label_id = label.id
    , size_test = 0.22
    , size_validation = 0.12
)
Build Model¶
[23]:
def fn_build(features_shape, label_shape, **hp):
    """Assemble an uncompiled Keras Sequential model for regression.

    Parameters
    ----------
    features_shape : tuple
        Shape of one feature sample; passed to the Input layer.
    label_shape : tuple
        Shape of one label sample; ``label_shape[0]`` sizes the output layer.
    **hp
        Hyperparameters: ``blocks`` (int), ``neuron_count`` (int),
        ``batch_norm`` (bool).

    Returns
    -------
    keras.Sequential
        The uncompiled model.
    """
    model = Sequential()
    model.add(Input(shape=features_shape))
    # One Dense "block" per hp['blocks'], each followed by
    # BatchNorm (optional), Activation, Dropout — "B.A.D." ordering.
    for _ in range(hp['blocks']):
        model.add(Dense(hp['neuron_count']))
        if hp['batch_norm']:
            model.add(BatchNormalization())
        model.add(Activation('relu'))
        model.add(Dropout(0.2))
    # Linear output layer (no activation) sized to the label — regression.
    model.add(Dense(label_shape[0]))
    return model
[24]:
def fn_train(model, loser, optimizer, samples_train, samples_evaluate, **hp):
    """Compile and fit the model, stopping early via AIQC metric cutoffs.

    Parameters
    ----------
    model : keras.Sequential
        The model produced by ``fn_build``.
    loser, optimizer
        Loss and optimizer objects supplied by AIQC.
    samples_train, samples_evaluate : dict
        Dicts with ``"features"`` and ``"labels"`` arrays.
    **hp
        Hyperparameters: ``batch_size`` (int), ``epoch_count`` (int).

    Returns
    -------
    keras.Sequential
        The trained model.
    """
    model.compile(
        loss = loser
        , optimizer = optimizer
        , metrics = ['mean_squared_error']
    )
    # Halt training once BOTH training and validation loss fall below 0.025.
    metric_cutoffs = [
        {"metric":"val_loss", "cutoff":0.025, "above_or_below":"below"},
        {"metric":"loss", "cutoff":0.025, "above_or_below":"below"}
    ]
    cutoffs = aiqc.TrainingCallback.Keras.MetricCutoff(metric_cutoffs)
    model.fit(
        samples_train["features"]
        , samples_train["labels"]
        , validation_data = (
            samples_evaluate["features"]
            , samples_evaluate["labels"]
        )
        , verbose = 0
        , batch_size = hp['batch_size']
        , callbacks=[History(), cutoffs]
        , epochs = hp['epoch_count']
    )
    return model
[25]:
# Register the build/train functions as a reusable AIQC Algorithm.
algorithm = aiqc.Algorithm.make(
    library = "keras"
    , analysis_type = "regression"
    , fn_build = fn_build
    , fn_train = fn_train
)
Stage Experiment¶
[26]:
# Hyperparameter grid: each key maps to the list of values to sweep.
# 1 x 1 x 2 x 1 x 2 x 1 = 4 training jobs.
hyperparameters = dict(
    batch_size=[3],
    blocks=[2],
    batch_norm=[True, False],
    epoch_count=[75],
    neuron_count=[24, 36],
    learning_rate=[0.01],
)
[27]:
# Attach the hyperparameter grid to the algorithm.
hyperparamset = algorithm.make_hyperparamset(
    hyperparameters = hyperparameters
)
[28]:
# Queue one training job per hyperparameter combination on this splitset.
queue = algorithm.make_queue(
    splitset_id = splitset.id
    , hyperparamset_id = hyperparamset.id
    , repeat_count = 1     # train each combination once
)
[29]:
# Execute every queued training job (4 hyperparameter combinations here).
queue.run_jobs()
🔮 Training Models 🔮: 100%|██████████████████████████████████████████| 4/4 [00:47<00:00, 11.97s/it]
For more information on visualization of performance metrics, reference the Visualization & Metrics documentation.